In [112]:
# Author: Stephen Situ
# This is a project that explores the different ways data can be cleaned and pre processed for machine learning. The first method
# is just sequentially cleaning the data manually one step at a time. This is good for infrequent use. The next method is to 
# define a function and pass the dataframe into the function. The last and most preferred method is to create a pipeline that
# can utilize the scikit learn pipeline library. It sequentially applies a list of transformations and a final estimator.
# They must implement fit and transform methods. The final estimator only needs to implement fit. 
In [ ]:
# Import pandas and Numpy
import pandas as pd
import numpy as np
In [113]:
# Read CSV
df = pd.read_csv('cars_raw.csv')
In [114]:
# Head
df.head()
Out[114]:
Year Make Model Used/New Price ConsumerRating ConsumerReviews SellerType SellerName SellerRating ... InteriorColor Drivetrain MinMPG MaxMPG FuelType Transmission Engine VIN Stock# Mileage
0 2019 Toyota Sienna SE Used $39,998 4.6 45 Dealer CarMax Murrieta - Now offering Curbside Pickup... 3.3 ... Black Front-wheel Drive 19 27 Gasoline 8-Speed Automatic 3.5L V6 24V PDI DOHC 5TDXZ3DC2KS015402 22998646 29403
1 2018 Ford F-150 Lariat Used $49,985 4.8 817 Dealer Giant Chevrolet 4.8 ... Black Four-wheel Drive 19 24 Gasoline 10-Speed Automatic 3.5L V6 24V PDI DOHC Twin Turbo 1FTEW1EG2JFD44217 22418A 32929
2 2017 RAM 1500 Laramie Used $41,860 4.7 495 Dealer Gill Auto Group Madera 4.6 ... Black Four-wheel Drive 15 21 Gasoline 8-Speed Automatic 5.7L V8 16V MPFI OHV 1C6RR7VT5HS842283 NG277871G 23173
3 2021 Honda Accord Sport SE Used $28,500 5.0 36 Dealer AutoSavvy Las Vegas 4.6 ... – Front-wheel Drive 29 35 Gasoline Automatic CVT 1.5L I4 16V GDI DOHC Turbo 1HGCV1F49MA038035 54237 10598
4 2020 Lexus RX 350 Used $49,000 4.8 76 Dealer Lexus of Henderson 4.8 ... Birch Front-wheel Drive 20 27 Gasoline 8-Speed Automatic 3.5L V6 24V PDI DOHC 2T2AZMAA8LC156270 HDT4181A 28137

5 rows × 32 columns

In [115]:
# Describe
df.describe()
Out[115]:
Year ConsumerRating ConsumerReviews SellerRating SellerReviews ComfortRating InteriorDesignRating PerformanceRating ValueForMoneyRating ExteriorStylingRating ReliabilityRating MinMPG MaxMPG Mileage
count 9379.000000 9379.000000 9379.000000 9379.000000 9379.000000 9379.000000 9379.000000 9379.000000 9379.000000 9379.000000 9379.000000 9379.000000 9379.000000 9379.000000
mean 2018.721719 4.702825 133.187014 4.412571 984.089988 4.771895 4.727391 4.696290 4.537083 4.782194 4.681746 22.755411 29.216548 37463.023350
std 2.221708 0.240795 154.985640 0.626258 1609.039864 0.217822 0.194391 0.253664 0.338098 0.171537 0.368161 14.812869 12.809783 24970.342569
min 2001.000000 2.500000 1.000000 1.000000 1.000000 3.000000 2.800000 1.000000 1.000000 3.000000 1.000000 0.000000 0.000000 121.000000
25% 2018.000000 4.700000 30.000000 4.300000 112.000000 4.700000 4.700000 4.600000 4.500000 4.700000 4.600000 18.000000 25.000000 18666.500000
50% 2019.000000 4.800000 75.000000 4.600000 542.000000 4.800000 4.800000 4.700000 4.600000 4.800000 4.800000 20.000000 27.000000 32907.000000
75% 2020.000000 4.800000 182.000000 4.800000 1272.000000 4.900000 4.800000 4.800000 4.700000 4.900000 4.900000 24.000000 31.000000 47698.000000
max 2022.000000 5.000000 817.000000 5.000000 27824.000000 5.000000 5.000000 5.000000 5.000000 5.000000 5.000000 150.000000 133.000000 234114.000000
In [116]:
# data types
df.dtypes
Out[116]:
Year                       int64
Make                      object
Model                     object
Used/New                  object
Price                     object
ConsumerRating           float64
ConsumerReviews            int64
SellerType                object
SellerName                object
SellerRating             float64
SellerReviews              int64
StreetName                object
State                     object
Zipcode                   object
DealType                  object
ComfortRating            float64
InteriorDesignRating     float64
PerformanceRating        float64
ValueForMoneyRating      float64
ExteriorStylingRating    float64
ReliabilityRating        float64
ExteriorColor             object
InteriorColor             object
Drivetrain                object
MinMPG                     int64
MaxMPG                     int64
FuelType                  object
Transmission              object
Engine                    object
VIN                       object
Stock#                    object
Mileage                    int64
dtype: object
In [117]:
# drop columns
df = df.drop(["Model","SellerName","StreetName","Zipcode","DealType","ExteriorColor","InteriorColor","Transmission","Engine","VIN","Stock#","Used/New","FuelType","Drivetrain"],axis=1)
In [118]:
# Head
df.head()
Out[118]:
Year Make Price ConsumerRating ConsumerReviews SellerType SellerRating SellerReviews State ComfortRating InteriorDesignRating PerformanceRating ValueForMoneyRating ExteriorStylingRating ReliabilityRating MinMPG MaxMPG Mileage
0 2019 Toyota $39,998 4.6 45 Dealer 3.3 3 CA 4.7 4.6 4.6 4.4 4.6 4.7 19 27 29403
1 2018 Ford $49,985 4.8 817 Dealer 4.8 131 CA 4.9 4.8 4.8 4.6 4.8 4.7 19 24 32929
2 2017 RAM $41,860 4.7 495 Dealer 4.6 249 CA 4.8 4.7 4.8 4.6 4.8 4.7 15 21 23173
3 2021 Honda $28,500 5.0 36 Dealer 4.6 284 NV 4.9 5.0 4.9 5.0 5.0 5.0 29 35 10598
4 2020 Lexus $49,000 4.8 76 Dealer 4.8 4755 NV 4.9 4.8 4.8 4.7 4.8 4.9 20 27 28137
In [119]:
# delete $ from string "Price"
# delete "Not Priced"
df["Price"] = df["Price"].str.replace("$","")
df["Price"] = df["Price"].str.replace(",","")
df["Price"] = df["Price"].str.replace("Not Priced","")
C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\3117742891.py:3: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  df["Price"] = df["Price"].str.replace("$","")
In [120]:
# Cast Price to numeric
df["Price"] = pd.to_numeric(df["Price"])
In [121]:
# Impute price as mean
df['Price'].fillna((df['Price'].mean()), inplace=True)
In [122]:
# Hot encode Categirucal features
df = pd.get_dummies(df)
df.to_csv("clean_normal.csv",index=False)  
In [123]:
# Head
df.head
Out[123]:
<bound method NDFrame.head of       Year    Price  ConsumerRating  ConsumerReviews  SellerRating  \
0     2019  39998.0             4.6               45           3.3   
1     2018  49985.0             4.8              817           4.8   
2     2017  41860.0             4.7              495           4.6   
3     2021  28500.0             5.0               36           4.6   
4     2020  49000.0             4.8               76           4.8   
...    ...      ...             ...              ...           ...   
9374  2019  27374.0             4.7              205           4.4   
9375  2019  61998.0             4.8               27           4.8   
9376  2017  26944.0             4.8              137           4.7   
9377  2019  28568.0             4.7              279           4.4   
9378  2019  32091.0             4.8              204           4.4   

      SellerReviews  ComfortRating  InteriorDesignRating  PerformanceRating  \
0                 3            4.7                   4.6                4.6   
1               131            4.9                   4.8                4.8   
2               249            4.8                   4.7                4.8   
3               284            4.9                   5.0                4.9   
4              4755            4.9                   4.8                4.8   
...             ...            ...                   ...                ...   
9374            443            4.7                   4.7                4.6   
9375           1789            4.9                   4.8                4.8   
9376            831            4.9                   4.8                4.7   
9377            680            4.8                   4.7                4.6   
9378           1105            4.9                   4.9                4.6   

      ValueForMoneyRating  ...  State_TX  State_US-12  State_US-169  State_UT  \
0                     4.4  ...         0            0             0         0   
1                     4.6  ...         0            0             0         0   
2                     4.6  ...         0            0             0         0   
3                     5.0  ...         0            0             0         0   
4                     4.7  ...         0            0             0         0   
...                   ...  ...       ...          ...           ...       ...   
9374                  4.7  ...         0            0             0         0   
9375                  4.6  ...         0            0             0         0   
9376                  4.6  ...         0            0             0         0   
9377                  4.7  ...         0            0             0         0   
9378                  4.8  ...         0            0             0         0   

      State_VA  State_VT  State_WA  State_WI  State_WV  State_WY  
0            0         0         0         0         0         0  
1            0         0         0         0         0         0  
2            0         0         0         0         0         0  
3            0         0         0         0         0         0  
4            0         0         0         0         0         0  
...        ...       ...       ...       ...       ...       ...  
9374         0         0         0         0         0         0  
9375         0         0         0         0         0         0  
9376         0         0         0         0         0         0  
9377         0         0         0         0         0         0  
9378         0         0         0         0         0         0  

[9379 rows x 117 columns]>
In [124]:
# Create a function to preform the cleaning and transformation task and test it
def clean(X):
    X = X.drop(["Model","SellerName","StreetName","Zipcode","DealType","ExteriorColor","InteriorColor","Transmission","Engine","VIN","Stock#","Used/New","FuelType","Drivetrain"],axis=1)
    X["Price"] = X["Price"].str.replace("$","")
    X["Price"] = X["Price"].str.replace(",","")
    X["Price"] = X["Price"].str.replace("Not Priced","")
    X["Price"] = pd.to_numeric(X["Price"])
    X['Price'].fillna((X['Price'].mean()), inplace=True)
    X = pd.get_dummies(X)
    X.to_csv("clean_function.csv",index=False)  
    return(X)
In [125]:
# Should get the same result as before
df2 = pd.read_csv('cars_raw.csv')
clean(df2)
C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\2668432355.py:4: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  X["Price"] = X["Price"].str.replace("$","")
Out[125]:
Year Price ConsumerRating ConsumerReviews SellerRating SellerReviews ComfortRating InteriorDesignRating PerformanceRating ValueForMoneyRating ... State_TX State_US-12 State_US-169 State_UT State_VA State_VT State_WA State_WI State_WV State_WY
0 2019 39998.0 4.6 45 3.3 3 4.7 4.6 4.6 4.4 ... 0 0 0 0 0 0 0 0 0 0
1 2018 49985.0 4.8 817 4.8 131 4.9 4.8 4.8 4.6 ... 0 0 0 0 0 0 0 0 0 0
2 2017 41860.0 4.7 495 4.6 249 4.8 4.7 4.8 4.6 ... 0 0 0 0 0 0 0 0 0 0
3 2021 28500.0 5.0 36 4.6 284 4.9 5.0 4.9 5.0 ... 0 0 0 0 0 0 0 0 0 0
4 2020 49000.0 4.8 76 4.8 4755 4.9 4.8 4.8 4.7 ... 0 0 0 0 0 0 0 0 0 0
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
9374 2019 27374.0 4.7 205 4.4 443 4.7 4.7 4.6 4.7 ... 0 0 0 0 0 0 0 0 0 0
9375 2019 61998.0 4.8 27 4.8 1789 4.9 4.8 4.8 4.6 ... 0 0 0 0 0 0 0 0 0 0
9376 2017 26944.0 4.8 137 4.7 831 4.9 4.8 4.7 4.6 ... 0 0 0 0 0 0 0 0 0 0
9377 2019 28568.0 4.7 279 4.4 680 4.8 4.7 4.6 4.7 ... 0 0 0 0 0 0 0 0 0 0
9378 2019 32091.0 4.8 204 4.4 1105 4.9 4.9 4.6 4.8 ... 0 0 0 0 0 0 0 0 0 0

9379 rows × 117 columns

In [126]:
# Using sci-kit learn we can create a pipeline that uses estimators and transformers
from sklearn.base import BaseEstimator, TransformerMixin

class Columndropper(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self, X):
        return X.drop(["Model","SellerName","StreetName","Zipcode","DealType","ExteriorColor","InteriorColor","Transmission","Engine","VIN","Stock#","Used/New","FuelType","Drivetrain"],axis=1)
    
class PriceCleaner(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self
    def transform(self, X):  
        X["Price"] = X["Price"].str.replace("$","")
        X["Price"] = X["Price"].str.replace(",","")
        X["Price"] = X["Price"].str.replace("Not Priced","")
        return X
    
class CastNumeric(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self    
    def transform(self, X):
        X["Price"] = pd.to_numeric(X["Price"])
        return X
    
class Imputer(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self 
    def transform(self, X):   
        X['Price'].fillna((X['Price'].mean()), inplace=True)
        return X
    
class HotEncode(BaseEstimator, TransformerMixin):
    def fit(self,X,y=None):
        return self 
    def transform(self, X):     
        X = pd.get_dummies(X)
        return X
In [127]:
# Use a new dataframe, df3
df3 = pd.read_csv('cars_raw.csv')
In [128]:
# Call the functions
# now we can use classes/methods/attributes to do the transformation
column_drop = Columndropper()
price_cleaner = PriceCleaner()
cast_numeric = CastNumeric()
imputer = Imputer()
hot_encode = HotEncode()
df_clean = hot_encode.transform(imputer.transform(cast_numeric.transform(price_cleaner.transform(column_drop.transform(df3)))))
df_clean.head()
C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\455890638.py:14: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  X["Price"] = X["Price"].str.replace("$","")
Out[128]:
Year Price ConsumerRating ConsumerReviews SellerRating SellerReviews ComfortRating InteriorDesignRating PerformanceRating ValueForMoneyRating ... State_TX State_US-12 State_US-169 State_UT State_VA State_VT State_WA State_WI State_WV State_WY
0 2019 39998.0 4.6 45 3.3 3 4.7 4.6 4.6 4.4 ... 0 0 0 0 0 0 0 0 0 0
1 2018 49985.0 4.8 817 4.8 131 4.9 4.8 4.8 4.6 ... 0 0 0 0 0 0 0 0 0 0
2 2017 41860.0 4.7 495 4.6 249 4.8 4.7 4.8 4.6 ... 0 0 0 0 0 0 0 0 0 0
3 2021 28500.0 5.0 36 4.6 284 4.9 5.0 4.9 5.0 ... 0 0 0 0 0 0 0 0 0 0
4 2020 49000.0 4.8 76 4.8 4755 4.9 4.8 4.8 4.7 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 117 columns

In [110]:
df4 = pd.read_csv('cars_raw.csv')
In [129]:
# Define pipe as a list of tuples 
from sklearn.pipeline import Pipeline
pipe = Pipeline([
    ("column_drop", Columndropper()),
    ("price_cleaner",  PriceCleaner()),
    ("cast_numeric", CastNumeric()),
    ("imputer",Imputer()),
    ("hot_encode",HotEncode())
])
In [130]:
# Use fit_transform and get the same result
df_pipe_cleaned = pipe.fit_transform(df4)
df_pipe_cleaned.to_csv("clean_pipeline.csv",index=False)  
df_pipe_cleaned.head()
C:\Users\Steve\AppData\Local\Temp\ipykernel_18592\455890638.py:14: FutureWarning: The default value of regex will change from True to False in a future version. In addition, single character regular expressions will *not* be treated as literal strings when regex=True.
  X["Price"] = X["Price"].str.replace("$","")
Out[130]:
Year Price ConsumerRating ConsumerReviews SellerRating SellerReviews ComfortRating InteriorDesignRating PerformanceRating ValueForMoneyRating ... State_TX State_US-12 State_US-169 State_UT State_VA State_VT State_WA State_WI State_WV State_WY
0 2019 39998.0 4.6 45 3.3 3 4.7 4.6 4.6 4.4 ... 0 0 0 0 0 0 0 0 0 0
1 2018 49985.0 4.8 817 4.8 131 4.9 4.8 4.8 4.6 ... 0 0 0 0 0 0 0 0 0 0
2 2017 41860.0 4.7 495 4.6 249 4.8 4.7 4.8 4.6 ... 0 0 0 0 0 0 0 0 0 0
3 2021 28500.0 5.0 36 4.6 284 4.9 5.0 4.9 5.0 ... 0 0 0 0 0 0 0 0 0 0
4 2020 49000.0 4.8 76 4.8 4755 4.9 4.8 4.8 4.7 ... 0 0 0 0 0 0 0 0 0 0

5 rows × 117 columns